Libraries

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.2
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(rstanarm)
## Warning: package 'rstanarm' was built under R version 4.2.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.2.2
## This is rstanarm version 2.21.3
## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
## - For execution on a local, multicore CPU with excess RAM we recommend calling
##   options(mc.cores = parallel::detectCores())
library(ggplot2)
library(data.table)
## Warning: package 'data.table' was built under R version 4.2.2
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(stringr)
library(ggsoccer)
## Warning: package 'ggsoccer' was built under R version 4.2.3
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.2
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.2.3
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(stringi)
## Warning: package 'stringi' was built under R version 4.2.2

Data Loading

We first want to load in the events data, which split across multiple csv files, grouped by country. To read this in, I have defined a loop that iterates through the data directory and appends each file to a data-frame.

#Define Data Directory
dir_path <- "../Data/events"

#Get a list of files in the directory
file_list <- list.files(dir_path)

#Create an empty data frame to store the file contents
actions <- data.frame()

#Loop through the files and add their contents to the data frame
for (i in 1:length(file_list)) {
  #Read the file into a data frame
  file_data <- fread(file.path(dir_path, file_list[i]))
  
  #Add the file data to the main data frame
  actions <- rbind(actions, file_data)
}
#View the events data frame
actions

We next want to load in the player data

#Read in data
players <- fromJSON("../Data/players/players.json")
#View player data
players

Since the event data contains all sorts of event types, we now want to filter out only the shots.

# Extract observations of shots from the actions data 
shots_df <- actions %>% dplyr::filter(subEventName == "Shot")

Event Data

The next section is dedicated to loading the event data

Categorical Data

We now use documentation from here to mutate the data, and garner more useful categorical data about the shot itself.

shots_df <- shots_df %>% 
  
  #If the shot is successful
  mutate('is_goal' = ifelse(grepl(" 101}", shots_df$tags),1,0), 
         
         #If the shot is at the end of a counter-attack
         'is_CA' = ifelse(grepl(" 1901}", shots_df$tags),1,0),
         
         #If the shot is with the foot or another part of the body
         'body_part' = ifelse(grepl(" 401}", shots_df$tags),"left",
                              ifelse(grepl(" 402}", shots_df$tags), "right", 
                                   ifelse(grepl(" 403}", shots_df$tags), "head/body", "NA"))),
         
         #If the shot is blocked
         'is_blocked' = ifelse(grepl(" 2101}", shots_df$tags), 1,0))

#Filter out only unblocked shots
shots_df <- shots_df %>% dplyr::filter(is_blocked == 0)

#Keep necessary categorical data
shots_cat <- dplyr::select(shots_df, c('playerId', 'is_goal', 'is_CA', 'body_part'))

summary(shots_cat)
##     playerId         is_goal           is_CA          body_part        
##  Min.   :     0   Min.   :0.0000   Min.   :0.00000   Length:32851      
##  1st Qu.: 11066   1st Qu.:0.0000   1st Qu.:0.00000   Class :character  
##  Median : 25707   Median :0.0000   Median :0.00000   Mode  :character  
##  Mean   : 93637   Mean   :0.1366   Mean   :0.05793                     
##  3rd Qu.:142755   3rd Qu.:0.0000   3rd Qu.:0.00000                     
##  Max.   :552555   Max.   :1.0000   Max.   :1.00000

Positional Data

We now calculate the distance from the goal and angle to goal for each shot.

First, we would like to define the position of each shot on the pitch

#Extract all numeric entries from the positions column
pos <- str_extract_all(gsub('"', "", shots_df$positions), "\\d+")

#Define vectors to store coordinates
x_pos <- c()
y_pos <- c()

#Loops that extract the coordinates
for (i in 1:length(pos)){
  x_pos <- append(x_pos, pos[[i]][2])
}

for (i in 1:length(pos)){
  y_pos <- append(y_pos, pos[[i]][1])
}

#Convert coordinates to numeric data
x_pos <- x_pos %>% as.numeric()
y_pos <- y_pos %>% as.numeric()
# Create coordinate dataframe
coords <- data.frame(x_pos, y_pos)

We can now use these coordinates to calculate distance and angle to goal

#Define length and width of pitch

pitch_x <- 105
pitch_y <- 68

#We now convert coordinates to meters
x_meter <- coords$x_pos * pitch_x/100
y_meter <- coords$y_pos * pitch_y/100

# Calculate distances
dist <- sqrt((105 - x_meter)^2 + ((32.5) - y_meter)^2)

#Calculate angles
angles <- atan( (7.32 * (105 - x_meter) ) / ( (105 - x_meter)^2 + (32.5 - y_meter)^2 - (7.32/2)^2 )) * 180/pi

We can now merge our useful event data into one data-frame.

#Concatenate data-frames
shots <- data.frame(shots_cat, dist, angles)

Player Data

The next section is dedicated to loading the player data. For this, we simply filter only by the features that will prove useful later on.

First lets retrieve a vector of all unique players in the current shots data-base:

#Retrieves unique player ids
player_list <- unique(shots$playerId)

We can now filter the players data-frame to only include these players

#Filters by player ids between both data frames
shooters <- dplyr::filter(players, wyId %in% player_list)

We can filter this data-frame by the features we need.

#Selects necessary columns
shooters <- dplyr::select(shooters, c('shortName', 'wyId', 'foot'))

Finally, we rename the some columns, for ease later on.

#Renames columns
colnames(shooters)[colnames(shooters) == "wyId"] <- "playerId"
colnames(shooters)[colnames(shooters) == "foot"] <- "preferred_foot"

Preferred Foot Data

We will now introduce a preferred foot binary variable.

First we merge all our useful data into one data-frame

#Concatenate data-frames
shots <- merge(shots, shooters, by = "playerId")

We now mutate this to add a column featuring the desired binary variable

#Adds preferred foot binary column
shots <- shots %>% 
  mutate(preferred_foot_b = ifelse(shots$preferred_foot == shots$body_part, 1, 0))

Finally, we remove the preferred_foot column

#Removes desired column
shots <- shots %>% dplyr::select(-c("preferred_foot"))

Data Cleaning/Wrangling

Since much of our data is categorical, it is necessary to convert it to the factor type.

#Convert necessary variables to factor 

shots$is_goal <- shots$is_goal %>% as.factor()

shots$body_part <- shots$body_part %>% as.factor()

shots$is_CA <- shots$is_CA %>% as.factor()

shots$preferred_foot_b <- shots$preferred_foot_b %>% as.factor()

shots$shortName <- shots$shortName %>% as.factor()

Now lets view a summary of our data

summary(shots)
##     playerId      is_goal   is_CA         body_part          dist       
##  Min.   :    36   0:28360   0:30946   head/body: 6645   Min.   :  0.54  
##  1st Qu.: 11066   1: 4489   1: 1903   left     :10225   1st Qu.: 11.56  
##  Median : 25707                       right    :15979   Median : 15.99  
##  Mean   : 93642                                         Mean   : 17.88  
##  3rd Qu.:142755                                         3rd Qu.: 24.22  
##  Max.   :552555                                         Max.   :103.97  
##                                                                         
##      angles                        shortName     preferred_foot_b
##  Min.   :-87.00   Cristiano Ronaldo     :  155   0:12623         
##  1st Qu.: 14.40   L. Insigne            :  139   1:20226         
##  Median : 19.64   H. Kane               :  137                   
##  Mean   : 23.54   E. D\\u017eeko        :  121                   
##  3rd Qu.: 30.59   L. Messi              :  121                   
##  Max.   : 89.66   I. Peri\\u0161i\\u0107:  117                   
##                   (Other)               :32059

Player Names

If we view a random subset of our data, we observe a problem decoding unicode characters:

shots[90:100,]

So we use the following chunk to decode them

shots$shortName <- stringi::stri_unescape_unicode(shots$shortName)

Negative Angles

We can see from the summary there are negative angles in the data, to investigate this further we can look at a histogram

hist(shots$angles)

We observe that there are multiple negative angles. Since most of the angles are correctly positive, we will remove the negative ones from the analysis.

shots <- shots %>% dplyr::filter(shots$angles > 0)

To see the corrected histogram

hist(shots$angles)

Player Downsampling

Later on we will use playerId to group the data. Since our data-set is large and spans many countries, there are many different players in the data-set

length(table(shots$playerId))
## [1] 2292

We see there are 2292 unique player included in the data-set

With this in hand, it would be sensible to limit the amount of “groups” (players) to, say 50. In order to preserve the greatest amount of data, we will use the top 50 most occurring player names.

top_players <- sort(table(shots$playerId), decreasing = T)[1:50]

Now we filter the data based on these players

top_shots <- dplyr::filter(shots, playerId %in% row.names(top_players))

We now view a summary of our final data

summary(top_shots)
##     playerId      is_goal  is_CA        body_part         dist       
##  Min.   :   122   0:3299   0:3716   head/body: 756   Min.   : 3.679  
##  1st Qu.:  7905   1: 727   1: 310   left     :1329   1st Qu.:10.974  
##  Median : 14945                     right    :1941   Median :14.749  
##  Mean   : 53939                                      Mean   :16.472  
##  3rd Qu.: 25716                                      3rd Qu.:20.832  
##  Max.   :447804                                      Max.   :98.898  
##      angles        shortName         preferred_foot_b
##  Min.   : 2.041   Length:4026        0:1539          
##  1st Qu.:15.430   Class :character   1:2487          
##  Median :22.007   Mode  :character                   
##  Mean   :25.839                                      
##  3rd Qu.:32.098                                      
##  Max.   :89.660

Data Exploration

To better understand our distance and angle data, we can create the following boxplots

#Defines and distance boxplot

dist_boxplot <- ggplot(top_shots, aes(x=is_goal, y=dist, fill = is_goal)) + 
                geom_boxplot() +
                labs(title="Distributions Of Distances Grouped By Shot Outcome", 
                     x="Shot Outcome", 
                     y="Distance To Goal (m)") + 
                coord_flip()

dist_boxplot <- dist_boxplot + guides(fill=guide_legend(title="Goal (1) or Not (0)"))

#Defines angles boxplot

angles_boxplot <- ggplot(top_shots, aes(x=is_goal, y=angles, fill = is_goal)) + 
                  geom_boxplot() + 
                  labs(title="Distributions Of Angles Grouped By Shot Outcome", 
                     x="Shot Outcome", 
                     y="Angle To Goal (Degrees)") +
                  coord_flip()

angles_boxplot <- angles_boxplot + guides(fill=guide_legend(title="Goal (1) or Not (0)"))

#Plots distance boxplot
dist_boxplot

#Plots angles boxplot
angles_boxplot

From these we observe there must is likely some relationship between the outcome of a shot and the position from which it is taken.

We now visualise the frequency at which each player takes a shot

Firstly, we need to wrangle the data a bit more:

#Create data-frame from top_players table defined earlier
top_players_df <- data.frame(top_players)

#Rename columns
colnames(top_players_df)[colnames(top_players_df) == "Var1"] <- "playerId"
colnames(top_players_df)[colnames(top_players_df) == "Freq"] <- "shotVolume"

#We add a column containing player name
top_players_df <- merge(top_players_df, distinct(top_shots[, c("playerId", "shortName")]), by = "playerId")

#We create a dataframe where the is_goal variable is numeric
numeric_goals <- top_shots[, c("shortName", "is_goal")]
numeric_goals$is_goal <- as.numeric(as.character(numeric_goals$is_goal))

#We sum up goals scored by player
summed_goals <- numeric_goals %>%
  group_by(shortName) %>% 
  summarise(goals = sum(is_goal))

#Merge to final data-frame
shots_goals <- merge(top_players_df, summed_goals, by = "shortName") 

#Sort in descending order by shot volume
shots_goals <- arrange(shots_goals, desc(shotVolume))
shots_goals$shortName <- shots_goals$shortName %>% as.factor()

Now we visualise the results

shots_goals_long <- gather(shots_goals, key = var, value = value, shotVolume, goals)

shots_goals_plot <- ggplot(shots_goals_long, aes(x=reorder(shortName, -value), y = value, fill = var)) +
                    geom_col(position = "identity", width = 0.9) +
                    labs(title="Shots And Goals By Player", 
                     x="Players", 
                     y="Volume") +
                    scale_x_discrete(guide = guide_axis(angle = 60))

shots_goals_plot <- shots_goals_plot + guides(fill=guide_legend(title="Key"))

shots_goals_plot

We can see from the plot, that there is some difference in a players ability to convert a shot. We can exploit this difference by adding another level to our models.

Fitting Models

Data Splitting

Since our data-set is somewhat small, it would be wise to have an uneven split of test and train data. This is carried out in the following chunk.

# Split into test and train subsets
train.size <- 0.8 * nrow(top_shots) 
train <- sample(1:nrow(top_shots), train.size)
test <- -train
shots.train <- top_shots[train, ]
shots.test <- top_shots[test, ]
is_goal.test <-  top_shots$is_goal[test]

Non-Baysian Models

is_goal ~ dist

First we will fit some simple logistic regression models

glm1 <- glm(is_goal ~ dist, data = top_shots, family = binomial())

summary(glm1)
## 
## Call:
## glm(formula = is_goal ~ dist, family = binomial(), data = top_shots)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0883  -0.6975  -0.5261  -0.2792   3.5203  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.226364   0.110297   2.052   0.0401 *  
## dist        -0.119491   0.007803 -15.313   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3802.7  on 4025  degrees of freedom
## Residual deviance: 3495.2  on 4024  degrees of freedom
## AIC: 3499.2
## 
## Number of Fisher Scoring iterations: 5

is_goal ~ dist + angles

glm2 <- glm(is_goal ~ dist + angles, data = top_shots, family = binomial())

summary(glm2)
## 
## Call:
## glm(formula = is_goal ~ dist + angles, family = binomial(), data = top_shots)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4259  -0.6616  -0.5080  -0.3281   3.1229  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.008302   0.283171  -3.561  0.00037 ***
## dist        -0.074710   0.011819  -6.321 2.60e-10 ***
## angles       0.020640   0.004428   4.661 3.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3802.7  on 4025  degrees of freedom
## Residual deviance: 3473.2  on 4023  degrees of freedom
## AIC: 3479.2
## 
## Number of Fisher Scoring iterations: 5

is_goal ~ .

glm3 <- glm(is_goal ~ . - shortName - playerId, data = top_shots, family = binomial())

summary(glm3)
## 
## Call:
## glm(formula = is_goal ~ . - shortName - playerId, family = binomial(), 
##     data = top_shots)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8279  -0.6449  -0.4795  -0.3011   3.3185  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.827369   0.313812  -5.823 5.78e-09 ***
## is_CA1             0.488807   0.150439   3.249  0.00116 ** 
## body_partleft      1.030124   0.146097   7.051 1.78e-12 ***
## body_partright     0.787840   0.171598   4.591 4.41e-06 ***
## dist              -0.091253   0.012458  -7.325 2.39e-13 ***
## angles             0.026604   0.004658   5.712 1.12e-08 ***
## preferred_foot_b1  0.209729   0.126546   1.657  0.09745 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3802.7  on 4025  degrees of freedom
## Residual deviance: 3370.6  on 4019  degrees of freedom
## AIC: 3384.6
## 
## Number of Fisher Scoring iterations: 5

Bayesian Linear Regression

Single-Level Models